import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import requests
import datetime
import folium
from branca.element import Figure
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw CCRB misconduct-complaint database from the NYCLU GitHub mirror.
# low_memory=False reads the file in one pass so wide columns get a single
# inferred dtype (fixes the DtypeWarning about mixed-type columns).
ccrb = pd.read_csv(
    'https://raw.githubusercontent.com/new-york-civil-liberties-union/NYPD-Misconduct-Complaint-Database-Updated/main/CCRB%20Complaint%20Database%20Raw%2004.20.2021.csv',
    low_memory=False)
# Drop the snapshot-date column and rows with no incident date.
ccrb = ccrb.drop('AsOfDate', axis=1)
ccrb = ccrb.loc[ccrb['IncidentDate'].notnull()]
# Reduce IncidentDate to just its four-digit year string.
ccrb['IncidentDate'] = ccrb['IncidentDate'].apply(
    lambda x: datetime.datetime.strptime(x, '%m/%d/%Y').strftime('%Y'))
# .copy() makes ccrb_2019 an independent frame; later cells assign a new
# 'Borough' column into it, which raises SettingWithCopyWarning on a slice view.
ccrb_2019 = ccrb.loc[ccrb['IncidentDate'] == '2019'].copy()
#ccrb_2019.to_csv('CCRB.csv',index=False)
#Sometimes multiple allegations were in one complaint, this DF creates one allegation for each event.
byDate = ccrb_2019.drop_duplicates(
    subset=['OfficerID', 'DaysOnForce', 'ComplaintID'],
    keep='last').reset_index(drop=True)
/Users/Dmorley2044/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3146: DtypeWarning: Columns (10,11,12,18,23,24,25,26,27,28,29,30,31) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
#Cleaning Dataset/Selecting Only Arrests From 2019:
#NYPD_arrests = pd.read_csv('NYPD_Arrest_Data__Historic.csv')
#NYPD_arrests['ARREST_DATE'] = NYPD_arrests['ARREST_DATE'].apply(lambda x: datetime.datetime.strptime(x, '%m/%d/%Y').strftime('%Y')) #CHANGE DATE TO YEARS
#arrests_2019 = NYPD_arrests.loc[NYPD_arrests['ARREST_DATE']=='2019']
#Fixing Borough Names for our 2019 Dataset
#def value_change():
#arrests_2019.loc[arrests_2019['ARREST_BORO']=='M','ARREST_BORO']='Manhattan'
#arrests_2019.loc[arrests_2019['ARREST_BORO']=='B','ARREST_BORO']='Bronx'
#arrests_2019.loc[arrests_2019['ARREST_BORO']=='Q','ARREST_BORO']='Queens'
#arrests_2019.loc[arrests_2019['ARREST_BORO']=='K','ARREST_BORO']='Brooklyn'
#arrests_2019.loc[arrests_2019['ARREST_BORO']=='S','ARREST_BORO']='Staten Island'
#value_change()
#arrests_2019.to_csv('NYPD_Arrest_2019.csv',index=False)
# Fetch the Census QuickFacts comparison page covering NYC and all five boroughs.
URL = 'https://www.census.gov/quickfacts/fact/table/newyorkcitynewyork,bronxcountybronxboroughnewyork,kingscountybrooklynboroughnewyork,newyorkcountymanhattanboroughnewyork,queenscountyqueensboroughnewyork,richmondcountystatenislandboroughnewyork/PST045219'
page = requests.get(URL)
# NOTE(review): no explicit parser is passed, so bs4 picks whichever parser is
# installed; output can differ between environments — consider 'html.parser'.
soup = BeautifulSoup(page.content)
soupString = str(soup)
# Flatten the markup to one line so the '.*?' regex scans below can match
# across what were originally separate lines.
soupString = soupString.replace('\n', '')
# These are the geoid codes the Census Bureau uses for NYC's boroughs.
Bronx = '36005'
Brooklyn = '36047'
Manhattan = '36061'
Queens = '36081'
Staten = '36085'
boroughs = [Brooklyn, Manhattan, Queens, Bronx, Staten]
# Accumulators, filled one entry per borough by the scraping loop below.
Population, PercentBlack, PercentWhite, PercentHL, PercentAsian, PopOver18, PercentOther = (
    [], [], [], [], [], [], [])
# Scrape per-borough race percentages from QuickFacts and renormalize them.
# Patterns are raw strings: '\w' in a plain string literal is an invalid
# escape (SyntaxWarning on modern Python); the regex itself is unchanged.
for i in boroughs:
    popSearch = r'id="PST045219" tab.*?geoid="' + i + r'.*?value="(\w+)'
    blkSearch = r'title="Black or African American alone.*?geoid="' + i + r'.*?value="(.......)'
    whiteSearch = r'n>White alone, no.*?geoid="' + i + r'.*?value="(.......)'
    HisLatSearch = r'="Hispanics may b.*?geoid="' + i + r'.*?value="(.......)'
    asnSearch = r'title="Asian alone.*?geoid="' + i + r'.*?value="(.......)'
    pop18Search = r'n>Persons under 18.*?geoid="' + i + r'.*?value="(.......)'
    pop = int(re.search(popSearch, soupString).group(1))
    Population.append(pop)
    # Convert "percent under 18" into an over-18 headcount.
    result18 = float(re.search(pop18Search, soupString).group(1))
    pop18 = int((1 - (result18 * .01)) * pop)
    PopOver18.append(pop18)
    #The Census's ACS is an estimation, so the percentages often add up to over 100%.
    #The following corrects for this and adds to the table
    blkResult = float(re.search(blkSearch, soupString).group(1))
    whtResult = float(re.search(whiteSearch, soupString).group(1))
    hlResult = float(re.search(HisLatSearch, soupString).group(1))
    asnResult = float(re.search(asnSearch, soupString).group(1))
    tot = blkResult + whtResult + hlResult + asnResult
    # Renormalize the four shares so they sum to exactly 100%, then store
    # them as fractions (0..1).
    whtcor = round((whtResult * 100) / tot, 2)
    blkcor = round((blkResult * 100) / tot, 2)
    asncor = round((asnResult * 100) / tot, 2)
    hlcor = round((hlResult * 100) / tot, 2)
    PercentHL.append(hlcor * .01)
    PercentAsian.append(asncor * .01)
    PercentBlack.append(blkcor * .01)
    PercentWhite.append(whtcor * .01)
# Assemble the scraped per-borough race shares into one borough-indexed table.
# (Population / PopOver18 were scraped too but are intentionally left out.)
CensusDF = pd.DataFrame({
    "Borough": ["Brooklyn", "Manhattan", "Queens", "Bronx", "Staten Island"],
    "Asian": PercentAsian,
    "Hispanic/Latino": PercentHL,
    "Black": PercentBlack,
    "White": PercentWhite,
}).set_index("Borough")
CensusDF
| Asian | Hispanic/Latino | Black | White | |
|---|---|---|---|---|
| Borough | ||||
| Brooklyn | 0.1240 | 0.1850 | 0.3309 | 0.3601 |
| Manhattan | 0.1242 | 0.2474 | 0.1720 | 0.4565 |
| Queens | 0.2674 | 0.2800 | 0.2056 | 0.2470 |
| Bronx | 0.0402 | 0.4964 | 0.3841 | 0.0793 |
| Staten Island | 0.1086 | 0.1847 | 0.1154 | 0.5912 |
Creating a Dataframe containing the number of complaints per precinct in NYC:
# Complaints per precinct. In this pandas version value_counts().reset_index()
# yields two columns: 'index' (the precinct label) and the count.
incidents = ccrb_2019['IncidentPrecinct'].value_counts().reset_index()
# Drop row label 76 — presumably a non-precinct value (TODO confirm against the
# raw data) — then order rows by precinct label.
incidents = incidents.drop(labels=76,axis=0).sort_values(by='index')
incidents = incidents.reset_index()
# reset_index() stashed the old row positions in 'level_0'; discard them.
incidents = incidents.drop('level_0',axis=1)
incidents.columns = ['Precinct','Count']
incidents
| Precinct | Count | |
|---|---|---|
| 0 | PCT 001 | 83 |
| 1 | PCT 005 | 24 |
| 2 | PCT 006 | 42 |
| 3 | PCT 007 | 80 |
| 4 | PCT 009 | 87 |
| ... | ... | ... |
| 72 | PCT 115 | 88 |
| 73 | PCT 120 | 175 |
| 74 | PCT 121 | 68 |
| 75 | PCT 122 | 69 |
| 76 | PCT 123 | 120 |
77 rows × 2 columns
Creating a function to add Borough names to our CCRB Dataset:
# Precinct-to-borough assignment by position in the precinct-sorted table.
# NOTE(review): these slice boundaries assume the sorted precinct order groups
# boroughs contiguously — verify if the precinct list ever changes.
manhattan = incidents['Precinct'][:22]
bronx = incidents['Precinct'][22:34]
brooklyn = incidents['Precinct'][34:57]
queens = incidents['Precinct'][57:73]
staten_island = incidents['Precinct'][73:77]
#Defining our Boroughs:
def borough():
    # Tags each 2019 complaint with its borough name; mutates the module-level
    # ccrb_2019 frame in place (source of the SettingWithCopyWarning seen in
    # the cell output when ccrb_2019 is a slice view).
    ccrb_2019.loc[ccrb_2019['IncidentPrecinct'].isin(manhattan),'Borough']='Manhattan'
    ccrb_2019.loc[ccrb_2019['IncidentPrecinct'].isin(bronx),'Borough']='Bronx'
    ccrb_2019.loc[ccrb_2019['IncidentPrecinct'].isin(brooklyn),'Borough']='Brooklyn'
    ccrb_2019.loc[ccrb_2019['IncidentPrecinct'].isin(queens),'Borough']='Queens'
    ccrb_2019.loc[ccrb_2019['IncidentPrecinct'].isin(staten_island),'Borough']='Staten Island'
borough()
/Users/Dmorley2044/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py:1596: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self.obj[key] = _infer_fill_value(value) /Users/Dmorley2044/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py:1765: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy isetter(loc, value)
Finding rows for individual races, and combining them into a dataframe labeled "borough_df":
#Finding rows where ImpactedRace = Black,White,Asian,Hispanic
b = ccrb_2019.loc[ccrb_2019['ImpactedRace']=='Black']
w = ccrb_2019.loc[ccrb_2019['ImpactedRace']=='White']
a = ccrb_2019.loc[ccrb_2019['ImpactedRace']=='Asian']
h = ccrb_2019.loc[ccrb_2019['ImpactedRace']=='Hispanic']
# Per-race complaint counts per borough.
black = b['Borough'].value_counts().reset_index()
black.columns = ['Borough','Count']
white = w['Borough'].value_counts().reset_index()
white.columns = ['Borough','Count']
asian = a['Borough'].value_counts().reset_index()
asian.columns = ['Borough','Count']
hispanic = h['Borough'].value_counts().reset_index()
hispanic.columns = ['Borough','Count']
# BUG FIX: value_counts() sorts boroughs by frequency, and that order differs
# per race, so pairing the raw 'Count' columns with a fixed borough-name list
# misaligned counts. Reindex each race's counts to one common borough order.
borough_order = ["Brooklyn", "Manhattan", "Queens", "Bronx", "Staten Island"]
def _counts_for(df):
    # One count per borough, in borough_order; 0 where a borough has no rows.
    return df.set_index('Borough')['Count'].reindex(borough_order, fill_value=0).values
borough_totals = {'Borough': borough_order,
                  'Asian': _counts_for(asian),
                  'Black': _counts_for(black),
                  'Hispanic': _counts_for(hispanic),
                  'White': _counts_for(white)}
borough_df = pd.DataFrame(borough_totals)
borough_df = borough_df.set_index('Borough')
# Convert raw counts to each race's share of that borough's complaints.
borough_per = borough_df.div(borough_df.sum(axis=1), axis=0)
Read in NYPD Arrest 2019 CSV File:
# Load the pre-filtered 2019 arrests export (written by the commented-out
# cleaning cell above). low_memory=False silences the mixed-dtype DtypeWarning
# seen in the cell output and lets pandas infer dtypes from the whole file.
arrest_df = pd.read_csv('NYPD_Arrest_2019.csv', low_memory=False)
#arrest_df.head()
/Users/Dmorley2044/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3146: DtypeWarning: Columns (19) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
Top Perpetrator Gender Category
# Tally arrests by perpetrator sex, largest group first.
PerpGenderDF = arrest_df['PERP_SEX'].value_counts(ascending=False).to_frame()
PerpGenderDF.index.name = 'Gender Type'
# Display with readable labels; rename() returns a styled copy for display
# only, so PerpGenderDF itself keeps the raw 'M'/'F' index.
PerpGenderDF.rename(
    index={'M': 'Male', 'F': 'Female'},
    columns={'PERP_SEX': 'Gender Total'},
).style.set_properties(**{'text-align': 'center'})
| Gender Total | |
|---|---|
| Gender Type | |
| Male | 175317 |
| Female | 39300 |
Top Perpetrator Race Category
# Tally arrests by perpetrator race, largest group first.
PerpRaceDF = arrest_df['PERP_RACE'].value_counts(ascending=False).to_frame()
PerpRaceDF.index.name = 'Race Type'
# rename() makes a display-only copy; PerpRaceDF keeps its original column name.
PerpRaceDF.rename(
    columns={'PERP_RACE': "Race Total"},
).style.set_properties(**{'text-align': 'center'})
| Race Total | |
|---|---|
| Race Type | |
| BLACK | 102585 |
| WHITE HISPANIC | 53675 |
| WHITE | 25478 |
| BLACK HISPANIC | 18755 |
| ASIAN / PACIFIC ISLANDER | 12104 |
| UNKNOWN | 1423 |
| AMERICAN INDIAN/ALASKAN NATIVE | 597 |
Arrests per Borough and Top 10 Crimes Reported
# Horizontal bar chart: arrest counts per borough, smallest at the bottom.
arrest_df['ARREST_BORO'].value_counts().sort_values().plot(kind="barh", color = 'indianred', title = 'Number of Arrests vs. Borough', edgecolor="black");
plt.xlabel('Number of Arrests');
plt.ylabel('Borough');
# Horizontal bar chart: the ten most common offense descriptions.
arrest_df.OFNS_DESC.value_counts().iloc[:10].sort_values().plot(kind="barh", title="Types of Crimes", edgecolor='black',color = 'indianred');
plt.xlabel('Number of Arrests');
plt.ylabel('Type of Crime');
Creating a new dataframe from our Arrest Dataset to have the same races as our Census Data:
# Collapse the arrest data's race categories to match the Census groupings.
races = ['WHITE','WHITE HISPANIC','BLACK','BLACK HISPANIC','ASIAN / PACIFIC ISLANDER']
hispanic = ['WHITE HISPANIC','BLACK HISPANIC']
# .copy() so the filtered frame owns its data: assigning into a slice view
# raises SettingWithCopyWarning and may silently fail to update the values.
arrest_df_races = arrest_df.loc[arrest_df['PERP_RACE'].isin(races)].copy()
arrest_df_races.loc[arrest_df_races['PERP_RACE'].isin(hispanic),'PERP_RACE'] = 'HISPANIC'
Bar Graph of Most Common Race Arrested Based on Borough
# Stacked horizontal bars: arrest counts per borough, split by perpetrator race.
colors = ['maroon','indianred','salmon','peachpuff']
arrest_df_races.groupby('ARREST_BORO')['PERP_RACE'].value_counts().unstack().plot.barh(stacked= True,color=colors,edgecolor="black");
# Legend placed outside the axes to the right.
plt.legend(loc="upper center",bbox_to_anchor=(1.45, 1), ncol=2);
plt.title('Race Of Perpetrator Arrested by Borough',size=14);
plt.ylabel('Borough');
plt.xlabel('Number of Arrests');
Using the requests library and BeautifulSoup to scrape the U.S. Census Bureau for New York City:
# Re-scrape QuickFacts, this time with borough columns only (no citywide column).
URL = 'https://www.census.gov/quickfacts/fact/table/bronxcountybronxboroughnewyork,kingscountybrooklynboroughnewyork,newyorkcountymanhattanboroughnewyork,queenscountyqueensboroughnewyork,richmondcountystatenislandboroughnewyork/PST045219'
page = requests.get(URL)
soup = BeautifulSoup(page.content)
soupString = str(soup)
# Flatten to one line for the '.*?' regex scans below.
soupString = soupString.replace('\n', '')
# Second load of the same 2019 arrests file (arrest_df above holds the same data).
arrestDF2019 = pd.read_csv('NYPD_Arrest_2019.csv')
# NOTE: 'boroughs' is rebound here to display names; it previously held geoids.
boroughs = ["Brooklyn", "Manhattan", "Queens", "Bronx", "Staten Island"]
# Per-borough arrest shares for the four tracked race groups.
whtArrL = []
blkArrL = []
asnArrL = []
hisArrL = []
for i in boroughs:
    # Count arrests in borough i for each group; Hispanic combines the two
    # WHITE/BLACK HISPANIC categories.
    blkArr = len(arrestDF2019[(arrestDF2019['PERP_RACE']=='BLACK') & (arrestDF2019['ARREST_BORO'] == i)])
    whtArr = len(arrestDF2019[(arrestDF2019['PERP_RACE']=='WHITE')&(arrestDF2019['ARREST_BORO']== i)])
    asnArr = len(arrestDF2019[(arrestDF2019['PERP_RACE']=='ASIAN / PACIFIC ISLANDER')&(arrestDF2019['ARREST_BORO']==i)])
    hisArr = len(arrestDF2019[(arrestDF2019['ARREST_BORO']==i)&((arrestDF2019['PERP_RACE']=='WHITE HISPANIC')|(arrestDF2019['PERP_RACE']=='BLACK HISPANIC'))])
    # Shares are of these four groups only, so they sum to 1 per borough.
    total=blkArr+whtArr+asnArr+hisArr
    blkArrL.append(round(blkArr/total,4))
    hisArrL.append(round(hisArr/total,4))
    whtArrL.append(round(whtArr/total,4))
    asnArrL.append(round(asnArr/total,4))
# Per-borough arrest shares by race (fractions of the four tracked groups).
ArrestsByRaceDF = pd.DataFrame({
    "Borough": boroughs,
    "Asian": asnArrL,
    "Hispanic/Latin": hisArrL,
    "Black": blkArrL,
    "White": whtArrL,
})
ArrestsByRaceDF
| Borough | Asian | Hispanic/Latin | Black | White | |
|---|---|---|---|---|---|
| 0 | Brooklyn | 0.0428 | 0.2361 | 0.5917 | 0.1294 |
| 1 | Manhattan | 0.0395 | 0.3335 | 0.4795 | 0.1475 |
| 2 | Queens | 0.1486 | 0.3406 | 0.3919 | 0.1188 |
| 3 | Bronx | 0.0130 | 0.4906 | 0.4585 | 0.0379 |
| 4 | Staten Island | 0.0396 | 0.2491 | 0.3694 | 0.3419 |
Creating a list of our boroughs, and empty lists for each item we are scraping from census:
# Census geoid codes for the five boroughs; 'boroughs' is rebound from the
# display-name list back to geoids for the second scraping pass.
Bronx = '36005'
Brooklyn = '36047'
Manhattan = '36061'
Queens = '36081'
Staten = '36085'
boroughs = [Brooklyn, Manhattan, Queens, Bronx, Staten]
# Fresh accumulators for the absolute-population scrape below.
Population, PercentBlack, PercentWhite, PercentHL, PercentAsian, PopOver18 = (
    [] for _ in range(6))
Using a for loop, as well as Regex to scrape information from the NYC Census webpage:
# Scrape per-borough totals and convert each race percentage into an absolute
# headcount. Patterns are raw strings: '\w' in a plain string literal is an
# invalid escape (SyntaxWarning on modern Python); regex semantics unchanged.
# NOTE(review): despite the 'Percent*' list names, this pass appends absolute
# population counts (share * total population), not fractions.
for i in boroughs:
    popSearch = r'id="PST045219" tab.*?geoid="' + i + r'.*?value="(\w+)'
    blkSearch = r'title="Black or African American alone.*?geoid="' + i + r'.*?value="(.......)'
    whiteSearch = r'n>White alone, no.*?geoid="' + i + r'.*?value="(.......)'
    HisLatSearch = r'="Hispanics may b.*?geoid="' + i + r'.*?value="(.......)'
    asnSearch = r'title="Asian alone.*?geoid="' + i + r'.*?value="(.......)'
    pop18Search = r'n>Persons under 18.*?geoid="' + i + r'.*?value="(.......)'
    pop = int(re.search(popSearch, soupString).group(1))
    Population.append(pop)
    # "Percent under 18" converted to an over-18 headcount.
    result18 = float(re.search(pop18Search, soupString).group(1))
    pop18 = int((1 - (result18 * .01)) * pop)
    PopOver18.append(pop18)
    blkResult = float(re.search(blkSearch, soupString).group(1))
    blkPop = int(pop * blkResult * .01)
    PercentBlack.append(blkPop)
    whtResult = float(re.search(whiteSearch, soupString).group(1))
    whtPop = int(pop * whtResult * .01)
    PercentWhite.append(whtPop)
    hlResult = float(re.search(HisLatSearch, soupString).group(1))
    hlPop = int(pop * hlResult * .01)
    PercentHL.append(hlPop)
    asnResult = float(re.search(asnSearch, soupString).group(1))
    asnPop = int(pop * asnResult * .01)
    PercentAsian.append(asnPop)
Creating a Dataframe to contain all the scraped census information:
# Absolute per-borough population counts by race from the second scrape.
# (The Percent* lists hold headcounts in this pass, not fractions.)
CensusDFRaw = pd.DataFrame({
    "Borough": ["Brooklyn", "Manhattan", "Queens", "Bronx", "Staten Island"],
    "Hispanic/Latin": PercentHL,
    "White": PercentWhite,
    "Black": PercentBlack,
    "Asian": PercentAsian,
})
CensusDFRaw
| Borough | Hispanic/Latin | White | Black | Asian | |
|---|---|---|---|---|---|
| 0 | Brooklyn | 483678 | 941266 | 864837 | 324188 |
| 1 | Manhattan | 416758 | 769097 | 289722 | 209226 |
| 2 | Queens | 634932 | 560133 | 466212 | 606355 |
| 3 | Bronx | 799579 | 127782 | 618555 | 64685 |
| 4 | Staten Island | 88637 | 283690 | 55363 | 52124 |
Analyzing the number of complaints per officer:
# Complaints per officer badge number (ShieldNo), most-complained-about first.
# NOTE(review): rename_axis('ComplaintCount') labels the *index* — which holds
# badge numbers — as 'ComplaintCount'; the counts live in the 'ShieldNo'
# column. Misleading naming, but downstream cells rely on these names.
complaintCountdf = ccrb_2019['ShieldNo'].value_counts(ascending=False).to_frame().rename_axis('ComplaintCount')
#Removing badge#0, is likely in error
complaintCountdf=complaintCountdf.drop(0)
#Sometimes multiple allegations were in one complaint.
# Keep one row per (officer, tenure, complaint) so each complaint counts once.
byDate = ccrb_2019.drop_duplicates(
subset = ['OfficerID', 'DaysOnForce', 'ComplaintID'],
keep = 'last').reset_index(drop = True)
byDateComplaints = byDate['ShieldNo'].value_counts(ascending=False).to_frame().rename_axis('ComplaintCount')
# Drop badge 0 here as well.
byDateComplaints=byDateComplaints.drop(0)
byDateComplaints.head(8)
| ShieldNo | |
|---|---|
| ComplaintCount | |
| 4105 | 9 |
| 25053 | 8 |
| 663 | 8 |
| 608 | 7 |
| 10909 | 6 |
| 12924 | 6 |
| 2680 | 6 |
| 2049 | 5 |
Creating Two Pie Charts to Display the Number of Officers vs. Complaints:
#36000 is the approximate number of officers employed by the NYPD
totOff = 36000
# Officers with no 2019 complaint = force size minus distinct badges with one.
noCom = 36000-len(byDateComplaints)
com = len(byDateComplaints)
y = np.array([noCom, com])
# Pie 1: share of officers with vs. without at least one complaint.
fig1, ax1 = plt.subplots(figsize=(4,6))
plt.title('Officers with Complaints vs. No Complaints',size=14);
wedgeprops = {"linewidth": 2, 'width':1, "edgecolor":"k"};
# Pull the "with complaints" wedge out of the pie for emphasis.
explode = (0,0.3);
ax1.pie(y,colors=['skyblue','indianred'],wedgeprops=wedgeprops,explode=explode,shadow=True,autopct='%1.1f%%');
labels = ['Officers with no complaints','Officers with a complaint']
ax1.legend(labels, loc="best");
# Pie 2: among officers with complaints, split by 1 / 2 / 3+ complaints.
# (The 'ShieldNo' column of byDateComplaints holds the complaint count.)
Over3 = byDateComplaints.loc[byDateComplaints['ShieldNo']>=3]
two = byDateComplaints.loc[byDateComplaints['ShieldNo']==2]
one = byDateComplaints.loc[byDateComplaints['ShieldNo']==1]
x = np.array([len(one), len(two), len(Over3)])
fig1, ax2 = plt.subplots(figsize=(4,6))
colors = ['indianred','salmon','peachpuff']
wedgeprops = {"linewidth": 2, 'width':1, "edgecolor":"k"};
plt.title('Officers With Complaints',size=14)
ax2.pie(x,colors = colors,wedgeprops=wedgeprops,autopct='%1.1f%%')
labels = ['1 Complaint','2 Complaints','3+ Complaints']
ax2.legend(labels, loc='upper right');
Histogram Showing the Breakdown of Officers with Complaints:
# Histogram: number of officers per complaint count (officers with >=1 complaint).
byDateComplaints.plot.hist(ylim=(0,3000), bins=9,color='indianred',
figsize=(5,3),edgecolor='black');
plt.title('Officers by Complaints',size=14);
plt.xlabel('Number of Complaints');
plt.ylabel('Number of Officers');
label=['Officers'];
plt.legend(label);
# Stacked bars: 2019 complaint counts per borough, split by impacted race.
colors = ['maroon','indianred','salmon','peachpuff']
borough_df.plot.bar(stacked = True, figsize=(8, 8),color=colors,edgecolor='black');
plt.legend(loc="best",bbox_to_anchor=(1.40, 1), ncol=2);
plt.title('Complaints by Race',size=14);
plt.xlabel('Borough');
plt.ylabel('Complaint Quantity');
# Recap of the Census race-share table used in the heatmaps below.
CensusDF.head()
| Asian | Hispanic/Latino | Black | White | |
|---|---|---|---|---|
| Borough | ||||
| Brooklyn | 0.1240 | 0.1850 | 0.3309 | 0.3601 |
| Manhattan | 0.1242 | 0.2474 | 0.1720 | 0.4565 |
| Queens | 0.2674 | 0.2800 | 0.2056 | 0.2470 |
| Bronx | 0.0402 | 0.4964 | 0.3841 | 0.0793 |
| Staten Island | 0.1086 | 0.1847 | 0.1154 | 0.5912 |
# Seaborn default theme for the heatmaps below.
sns.set_theme()
# Align column names with ArrestsByRaceDF before subtracting.
CensusDF = CensusDF.rename(columns={'Hispanic/Latino':'Hispanic/Latin'})
ArrHeatDF = ArrestsByRaceDF.set_index('Borough')
# Arrest share minus population share: positive = over-represented in arrests.
ArrHeatDF = ArrHeatDF.subtract(CensusDF)
ArrHeatDF = ArrHeatDF.round(decimals=2)
ax = sns.heatmap(ArrHeatDF, center=0)
ArrHeatDF
| Asian | Hispanic/Latin | Black | White | |
|---|---|---|---|---|
| Borough | ||||
| Brooklyn | -0.08 | 0.05 | 0.26 | -0.23 |
| Manhattan | -0.08 | 0.09 | 0.31 | -0.31 |
| Queens | -0.12 | 0.06 | 0.19 | -0.13 |
| Bronx | -0.03 | -0.01 | 0.07 | -0.04 |
| Staten Island | -0.07 | 0.06 | 0.25 | -0.25 |
Created heatmap comparing arrest percentage by race to complaint percentage by race
# Complaint share minus arrest share, per race and borough.
comHeat = borough_per
# Align column names with ArrestsByRaceDF before subtracting.
comHeat = comHeat.rename(columns={'Hispanic':'Hispanic/Latin'})
comHeat = comHeat.subtract(ArrestsByRaceDF.set_index('Borough'))
# The heatmap is drawn from full-precision values; rounding afterwards is for
# the displayed table only.
ax = sns.heatmap(comHeat, center=0)
comHeat = comHeat.round(decimals=2)
comHeat
| Asian | Black | Hispanic/Latin | White | |
|---|---|---|---|---|
| Borough | ||||
| Brooklyn | -0.01 | 0.02 | 0.01 | -0.02 |
| Manhattan | -0.00 | 0.07 | -0.08 | 0.02 |
| Queens | -0.11 | 0.15 | -0.07 | 0.04 |
| Bronx | 0.03 | 0.07 | -0.24 | 0.14 |
| Staten Island | 0.05 | -0.11 | 0.05 | 0.00 |
# Download the NYPD precinct directory page; printing 200 confirms success.
nypd_URL = 'https://www1.nyc.gov/site/nypd/bureaus/patrol/precincts-landing.page'
precinct_page = requests.get(nypd_URL)
status = precinct_page.status_code
print(status)
200
Writing our HTML response to a file labeled "Precinct_file.txt"
# Cache the precincts page HTML locally. A context manager guarantees the
# handle is flushed and closed even if the write fails (the original left the
# file object to be closed manually).
with open('Precinct_file.txt', 'w') as precinct_file:
    precinct_file.write(precinct_page.text)
Opening our Precinct file and creating a BeautifulSoup Object:
# Read the cached precincts HTML back and parse it. The original opened the
# file without ever closing it; 'with' releases the handle deterministically.
with open('Precinct_file.txt', 'r') as f:
    html = f.read()
soup_PD = BeautifulSoup(html, 'html.parser')
#print(soup_PD.prettify())
Extracting portion of HTML code for Precinct Numbers:
# Every precinct number sits in a <td data-label="Precinct"> cell.
precinct_number = soup_PD.find_all('td', attrs={'data-label': 'Precinct'})
#precinct_number
Since the addresses on the page don't provide the precinct's city, state, or zip-code, we need to create another request for the individual hyper-links on each precinct.
The code below pulls the desired links for each of our precincts.
# Collect each precinct's detail-page link (relative href) from the Precinct
# cells; the detail pages carry the city/state/zip information.
address_links = [
    anchor.attrs['href']
    for cell in precinct_number
    for anchor in cell.find_all("a")
    if 'href' in anchor.attrs
]
#print(address_links)
Creates a list called "html_list" and adds the html from our request object for each of the hyper-links.
# Fetch each precinct detail page and keep its raw HTML for the zip scrape.
base_url = 'https://www1.nyc.gov/'
html_list = [requests.get(base_url + link).text for link in address_links]
Creates a list containing the city, state, and zip code for each precinct using the regex code below.
# The line after the <br /> inside the address <p> holds city, state, and zip.
nypd_precinct_zip = [
    re.findall(r"<p>[\w&\s/;,.-]+<br />(.*)</p>", page_html)
    for page_html in html_list
]
Pulls information from our Beautiful Soup object to give us the street address for each precinct.
# Street addresses live in <td data-label="Address"> cells.
precinct_address = soup_PD.find_all('td', attrs={'data-label': 'Address'})
#precinct_address
Code using Regex to pull the precinct numbers from our precinct_number BeautifulSoup object.
# Pull precinct names (e.g. "1st Precinct") out of the cell markup. Raw string
# fixes the invalid '\w'/'\s' escapes in the plain string literal (a
# SyntaxWarning on modern Python); the pattern itself is unchanged.
precincts = re.findall(r'[\w\s]+\s\w{8}', str(precinct_number))
#precincts
Code using Regex that pulls the street addresses from our precinct_address BeautifulSoup object.
NOTE: there were issues with one address in particular due to the ampersand, which I have manually corrected as seen below.
# Extract the address text between tags. Raw string fixes the invalid '\w'/'\s'
# escapes in the plain string literal; the pattern itself is unchanged.
addresses = re.findall(r'>([\w&\s/;.-]+)<', str(precinct_address))
# One address contains an HTML-encoded ampersand that the pattern splits;
# patch that entry by hand.
addresses[12] = '86th St & Transverse Road'
#addresses
Creates a dictionary containing the precinct numbers and their respective street addresses and creates a dataframe out of this dictionary.
# Pair each precinct name with its scraped street address.
precinct_dictionary = {'Number': precincts, 'Address': addresses}
precinct_df = pd.DataFrame(precinct_dictionary)
# Quick sanity check of the first rows.
precinct_df.head()
| Number | Address | |
|---|---|---|
| 0 | 1st Precinct | 16 Ericsson Place |
| 1 | 5th Precinct | 19 Elizabeth Street |
| 2 | 6th Precinct | 233 West 10 Street |
| 3 | 7th Precinct | 19 1/2 Pitt Street |
| 4 | 9th Precinct | 321 East 5 Street |
Using the precinct hyper-links, we pulled the city, state, and zip code for each and added them to this dataframe here.
# Each scraped entry is a one-element list from re.findall, so the DataFrame
# gets a single 'Zip' column of strings like "New York, NY, 10013".
zip_df = pd.DataFrame(nypd_precinct_zip,columns=['Zip'])
#zip_df
Next, we add our two dataframes together to get full addresses for all of our NYPD precincts.
# Join street addresses with city/state/zip by row position (both frames were
# built from the same page in the same order).
final_df = pd.concat([precinct_df,zip_df],axis=1)
# Build the full mailing address, then drop the now-redundant Zip column.
final_df['Address'] = final_df['Address'] + ', ' + final_df['Zip']
precinct_address_df = final_df.drop(['Zip'],axis=1)
precinct_address_df.head()
#precinct_address_df.to_csv('final_precinct.csv',index=False) #OPTIONAL EXPORT PRECINCT DATA
| Number | Address | |
|---|---|---|
| 0 | 1st Precinct | 16 Ericsson Place, New York, NY, 10013 |
| 1 | 5th Precinct | 19 Elizabeth Street, New York, NY, 10013 |
| 2 | 6th Precinct | 233 West 10 Street, New York, NY, 10014 |
| 3 | 7th Precinct | 19 1/2 Pitt Street, New York, NY, 10002 |
| 4 | 9th Precinct | 321 East 5 Street, New York, NY, 10003 |
Using the Google API, we can loop through our precinct addresses and get their respective latitude and longitude coordinates.
# Geocode each precinct address via the Google Geocoding API.
# SECURITY: this API key is hard-coded in source control; it should be revoked
# and loaded from an environment variable or config file instead.
key = "AIzaSyAwqsiGFUJy9skg4zKLQZJTSzEYZbcEHlE"
geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'
coordinates = []
for address in precinct_address_df['Address']:
    # Passing params lets requests URL-encode the address (spaces, commas, '&')
    # rather than concatenating it raw into the query string.
    response = requests.get(geocode_url, params={'address': address, 'key': key})
    resp_json_payload = response.json()
    # Keep only the lat/lng dict of the top geocoding match.
    coordinates.append(resp_json_payload['results'][0]['geometry']['location'])
Code below creates a new dataframe containing the latitude and longitude points from our Google API.
# One row per precinct with 'lat'/'lng' columns from the geocoding results.
coordinates_df = pd.DataFrame(coordinates)
coordinates_df.head()
| lat | lng | |
|---|---|---|
| 0 | 40.720270 | -74.007198 |
| 1 | 40.716194 | -73.997470 |
| 2 | 40.734233 | -74.005453 |
| 3 | 40.716371 | -73.984059 |
| 4 | 40.726559 | -73.987824 |
Code below concatenates our Precinct address dataframe with our new coordinate dataframe to complete our location dataframe.
# Attach coordinates to the precinct addresses by row position (the geocoding
# loop preserved precinct_address_df's row order).
police_precinct_location_df = pd.concat([precinct_address_df,coordinates_df],axis=1)
police_precinct_location_df.head()
| Number | Address | lat | lng | |
|---|---|---|---|---|
| 0 | 1st Precinct | 16 Ericsson Place, New York, NY, 10013 | 40.720270 | -74.007198 |
| 1 | 5th Precinct | 19 Elizabeth Street, New York, NY, 10013 | 40.716194 | -73.997470 |
| 2 | 6th Precinct | 233 West 10 Street, New York, NY, 10014 | 40.734233 | -74.005453 |
| 3 | 7th Precinct | 19 1/2 Pitt Street, New York, NY, 10002 | 40.716371 | -73.984059 |
| 4 | 9th Precinct | 321 East 5 Street, New York, NY, 10003 | 40.726559 | -73.987824 |
Combine our Incidents dataframe with our precinct location dataframe to complete our dataset.
# Join precinct locations with per-precinct complaint counts by row position.
# NOTE(review): assumes 'incidents' rows are in the same precinct order as the
# scraped precinct list — confirm if either source changes.
police_misconduct = pd.concat([police_precinct_location_df,incidents],axis=1)
# Drop the duplicate precinct-label column contributed by 'incidents'.
police_misconduct = police_misconduct.drop('Precinct',axis=1)
police_misconduct.columns = ['Precinct','Address','Latitude','Longitude','# of Incidents']
police_misconduct
| Precinct | Address | Latitude | Longitude | # of Incidents | |
|---|---|---|---|---|---|
| 0 | 1st Precinct | 16 Ericsson Place, New York, NY, 10013 | 40.720270 | -74.007198 | 83 |
| 1 | 5th Precinct | 19 Elizabeth Street, New York, NY, 10013 | 40.716194 | -73.997470 | 24 |
| 2 | 6th Precinct | 233 West 10 Street, New York, NY, 10014 | 40.734233 | -74.005453 | 42 |
| 3 | 7th Precinct | 19 1/2 Pitt Street, New York, NY, 10002 | 40.716371 | -73.984059 | 80 |
| 4 | 9th Precinct | 321 East 5 Street, New York, NY, 10003 | 40.726559 | -73.987824 | 87 |
| ... | ... | ... | ... | ... | ... |
| 72 | 115th Precinct | 92-15 Northern Boulevard, Jackson Hgts., NY, 1... | 40.756875 | -73.875490 | 88 |
| 73 | 120th Precinct | 78 Richmond Terrace, St. George, NY, 10301-1905 | 40.644655 | -74.077415 | 175 |
| 74 | 121st Precinct | 970 Richmond Avenue, Staten Island, NY, 10314-... | 40.623676 | -74.150343 | 68 |
| 75 | 122nd Precinct | 2320 Hylan Boulevard, New Dorp, NY, 10306-3207 | 40.574118 | -74.105506 | 69 |
| 76 | 123rd Precinct | 116 Main Street, Staten Island, NY, 10307-1227 | 40.511844 | -74.250036 | 120 |
77 rows × 5 columns
Creating a Map of NYC Showing the Number of Complaints in Folium
#Creating our NYC Folium Map
fig = Figure(width=550, height=350)
nyc = folium.Map(width=550, height=350, location=[40.693943, -73.985880],
                 zoom_start=10, min_zoom=8, max_zoom=12, tiles='cartodbpositron')
fig.add_child(nyc)
#Adding our Borough Borders:
data_url = ('https://raw.githubusercontent.com/dwillis/nyc-maps/master/boroughs.geojson')
folium.GeoJson(data_url).add_to(nyc)
# Adding Borough Markers. BUG FIX: the Brooklyn popup previously read
# 'Brookyln'.
borough_markers = [
    ("Manhattan", [40.7831, -73.9712]),
    ("Brooklyn", [40.6782, -73.9442]),
    ("Bronx", [40.8448, -73.8648]),
    ("Queens", [40.7282, -73.7949]),
    ("Staten Island", [40.5795, -74.1502]),
]
for name, coords in borough_markers:
    folium.Marker(
        location=coords,
        popup=name,
        icon=folium.Icon(color="red", icon="info-sign"),
    ).add_to(nyc)
# Circle markers sized by complaint volume (radius = incidents / 20).
for _, row in police_misconduct.iterrows():
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=float(row['# of Incidents'] / 20),
        popup='Precinct: ' + row['Precinct'] + ' ' + 'Incidents: ' + str(row['# of Incidents']),
        color="red",
        fill=True,
        fill_color="red",
    ).add_to(nyc)
nyc
Stacked Bar Graph of Most Common Age Group Arrested Based on Borough
#arrest_df.groupby('ARREST_BORO')['AGE_GROUP'].value_counts().unstack().plot.bar(stacked=True)
#plt.legend(loc="upper center",bbox_to_anchor=(1.60, 1.15), ncol=2)
#plt.title('Age Group of Arrested Perpretator by Borough')
#plt.xlabel('Borough')
#plt.ylabel('# of Arrest Made')
#plt.figure(figsize=(20,10))
#Swap to breakdown of arrest race to pop racial breakdown
#stack bar plot
#arrest breakdown
#35%b
#50w
#15asian
#actual pop breakdown
#15b
#75w
#10a
Bar Graph of Gender Arrested Based on Borough
#arrest_df.groupby('ARREST_BORO')['PERP_SEX'].value_counts().unstack().plot.bar()
#plt.legend(["Female","Male"],loc="upper center",bbox_to_anchor=(1.60, 1.15), ncol=2)
#plt.title('Gender of Arrested Perpretator by Borough')
#plt.xlabel('Borough')
#plt.ylabel('# of Arrest Made')
#plt.figure(figsize=(20,10))
Level of Offense Based on Borough
#df.groupby('ARREST_BORO')['LAW_CAT_CD'].value_counts().unstack().plot.bar()
#plt.legend(["Felony","Incident","Misdemeanor","Violation"],loc="upper center",bbox_to_anchor=(1.60, 1.15), ncol=2)
#plt.title('Level of Offense by Borough')
#plt.xlabel('Borough')
#plt.ylabel('# of Arrest Made')
#plt.figure(figsize=(20,10))
Filtering Arrests Determined as Felony, Misdemeanor, and Violation
#felony_data = arrest_df.loc[arrest_df['LAW_CAT_CD'] == 'F'].count()[0]
#misdemenor_data = arrest_df.loc[arrest_df['LAW_CAT_CD'] == 'M'].count()[0]
#violation_data = arrest_df.loc[arrest_df['LAW_CAT_CD'] == 'V'].count()[0]
Pie Chart of Percentage of Each Level Offense for New York
#labels = ['Felony', 'Misdemeanor', 'Violation']
#plt.pie([felony_data, misdemenor_data, violation_data], labels=labels, autopct = '%.2f')
#plt.title('Level of Offense Percentage in New York')
#plt.show()